Business Task: Analyze Fitbit data to gain insight and help guide marketing strategy for Bellabeat to grow as a global player.
Data can be found on Kaggle. The notebook consists of 18 data sets. To determine the credibility and integrity of the data, I will use the “ROCCC” system (Reliable, Original, Comprehensive, Current, Cited).
Notes: setting up my environment by installing and loading the ‘tidyverse’, ‘skimr’, ‘plotly’, ‘dplyr’, and ‘ggplot2’ packages
# Install only the packages that are not already available, so that re-running
# or knitting the notebook does not reinstall everything (slow, and it needs
# network access) each time.
required_packages <- c("tidyverse", "skimr", "plotly", "dplyr", "ggplot2")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(readr)
library(dplyr)
Notes: The “Daily Activity” and “Sleep Day” data sets have been selected for our data analysis
# Load the daily activity and sleep-day CSV exports from the Fitabase folder
daily_activity <- read.csv(
  file = "Fitabase/dailyActivity_merged.csv"
)
sleep_activity <- read.csv(
  file = "Fitabase/sleepDay_merged.csv"
)
Notes: I will inspect the data sets for missing (NA) values and duplicate entries, and remove any problem rows
# Preview the first six rows of the daily activity data
head(daily_activity, n = 6L)
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366 4/12/2016 13162 8.50 8.50
## 2 1503960366 4/13/2016 10735 6.97 6.97
## 3 1503960366 4/14/2016 10460 6.74 6.74
## 4 1503960366 4/15/2016 9762 6.28 6.28
## 5 1503960366 4/16/2016 12669 8.16 8.16
## 6 1503960366 4/17/2016 9705 6.48 6.48
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## 6 0 3.19 0.78
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## 6 2.51 0 38
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1 13 328 728 1985
## 2 19 217 776 1797
## 3 11 181 1218 1776
## 4 34 209 726 1745
## 5 10 221 773 1863
## 6 20 164 539 1728
# Preview the first six rows of the sleep data
head(sleep_activity, n = 6L)
## Id SleepDay TotalSleepRecords TotalMinutesAsleep
## 1 1503960366 4/12/2016 12:00:00 AM 1 327
## 2 1503960366 4/13/2016 12:00:00 AM 2 384
## 3 1503960366 4/15/2016 12:00:00 AM 1 412
## 4 1503960366 4/16/2016 12:00:00 AM 2 340
## 5 1503960366 4/17/2016 12:00:00 AM 1 700
## 6 1503960366 4/19/2016 12:00:00 AM 1 304
## TotalTimeInBed
## 1 346
## 2 407
## 3 442
## 4 367
## 5 712
## 6 320
# List the column names of each data set
names(daily_activity)
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories"
names(sleep_activity)
## [1] "Id" "SleepDay" "TotalSleepRecords"
## [4] "TotalMinutesAsleep" "TotalTimeInBed"
# Count the distinct users in each data set: 33 Ids in daily_activity and
# 24 Ids in sleep_activity
length(unique(daily_activity$Id))
## [1] 33
length(unique(sleep_activity$Id))
## [1] 24
# Count fully duplicated rows in each data set
sum(duplicated(daily_activity))
## [1] 0
sum(duplicated(sleep_activity))
## [1] 3
# Keep only the first occurrence of each duplicated sleep record
sleep_activity <- sleep_activity[!duplicated(sleep_activity), , drop = FALSE]
Notes: I will transform data in the data sets to help with visualizing the data
# Split SleepDay (e.g. "4/12/2016 12:00:00 AM") into a Date and a Time column.
# The value contains two spaces, so splitting it into two columns on " "
# discarded the trailing AM/PM marker and raised an "Additional pieces
# discarded" warning on every row. extra = "merge" splits only at the first
# space, so Time keeps the full clock time ("12:00:00 AM").
sleep_activity <- sleep_activity %>%
  separate(SleepDay, into = c("Date", "Time"), sep = " ", extra = "merge")
# Add a Weekday factor column to daily_activity, ordered Monday through Sunday.
# weekdays() returns day names in the session locale, so matching its output
# against English factor levels would produce NA for every row outside an
# English locale. The "%u" format gives the weekday as a number
# (1 = Monday ... 7 = Sunday) regardless of locale, so it is used to index the
# English labels instead.
weekday_levels <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)
daily_activity <- daily_activity %>%
  mutate(
    Weekday = factor(
      weekday_levels[
        as.integer(format(as.Date(ActivityDate, format = "%m/%d/%Y"), "%u"))
      ],
      levels = weekday_levels
    )
  )
# Summary statistics for the key daily activity measures
activity_metrics <- c(
  "TotalSteps",
  "TotalDistance",
  "SedentaryMinutes",
  "LightlyActiveMinutes",
  "FairlyActiveMinutes",
  "VeryActiveMinutes",
  "Calories"
)
summary(daily_activity[activity_metrics])
## TotalSteps TotalDistance SedentaryMinutes LightlyActiveMinutes
## Min. : 0 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.: 3790 1st Qu.: 2.620 1st Qu.: 729.8 1st Qu.:127.0
## Median : 7406 Median : 5.245 Median :1057.5 Median :199.0
## Mean : 7638 Mean : 5.490 Mean : 991.2 Mean :192.8
## 3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.:1229.5 3rd Qu.:264.0
## Max. :36019 Max. :28.030 Max. :1440.0 Max. :518.0
## FairlyActiveMinutes VeryActiveMinutes Calories
## Min. : 0.00 Min. : 0.00 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:1828
## Median : 6.00 Median : 4.00 Median :2134
## Mean : 13.56 Mean : 21.16 Mean :2304
## 3rd Qu.: 19.00 3rd Qu.: 32.00 3rd Qu.:2793
## Max. :143.00 Max. :210.00 Max. :4900
# Summary statistics for the sleep measures
sleep_metrics <- c("TotalSleepRecords", "TotalMinutesAsleep", "TotalTimeInBed")
summary(sleep_activity[sleep_metrics])
## TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## Min. :1.00 Min. : 58.0 Min. : 61.0
## 1st Qu.:1.00 1st Qu.:361.0 1st Qu.:403.8
## Median :1.00 Median :432.5 Median :463.0
## Mean :1.12 Mean :419.2 Mean :458.5
## 3rd Qu.:1.00 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :3.00 Max. :796.0 Max. :961.0